package focusedCrawler.target.classifier;
import java.nio.file.Path;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import com.fasterxml.jackson.core.JsonProcessingException;
import com.fasterxml.jackson.databind.JsonNode;
import com.fasterxml.jackson.databind.ObjectMapper;
import focusedCrawler.target.model.Page;
/**
* Classify whether a page is relevant to a topic by matching a RegExp against the title.
*/
public class TitleRegexTargetClassifier implements TargetClassifier {
private Pattern pattern;
public TitleRegexTargetClassifier(String regex) {
regex = ".*" + regex + ".*";
this.pattern = Pattern.compile(regex, Pattern.CASE_INSENSITIVE);
}
@Override
public TargetRelevance classify(Page page) throws TargetClassifierException {
if(regexMatchesTitle(page)) {
return TargetRelevance.RELEVANT;
} else {
return TargetRelevance.IRRELEVANT;
}
}
public boolean regexMatchesTitle(Page page) {
String title = page.getParsedData().getTitle();
if (title != null) {
Matcher matcher = this.pattern.matcher(title);
if (matcher.matches()) {
return true;
} else {
return false;
}
} else {
return false;
}
}
static class TitleRegexClassifierConfig {
public String regular_expression;
}
public static class Builder {
public TargetClassifier build(Path basePath, ObjectMapper yaml, JsonNode parameters) throws JsonProcessingException {
TitleRegexClassifierConfig params = yaml.treeToValue(parameters, TitleRegexClassifierConfig.class);
if (params.regular_expression != null && !params.regular_expression.trim().isEmpty()) {
return new TitleRegexTargetClassifier(params.regular_expression.trim());
} else {
return null;
}
}
}
}